### Data Analysis
import numpy as np
import pandas as pd
### Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
from scipy.interpolate import make_interp_spline, BSpline
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=Warning)
# Display every column when a frame is shown, and make DataFrame.plot() draw
# with plotly instead of matplotlib.
pd.options.display.max_columns = None
pd.set_option('plotting.backend', "plotly")

# The three global time-series CSVs live in the same directory of the
# CSSEGISandData/COVID-19 repository and differ only in the metric name.
_TIME_SERIES_URL = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
    "time_series_covid19_{}_global.csv"
)
confirmed_cases_df = pd.read_csv(_TIME_SERIES_URL.format("confirmed"))
recovered_cases_df = pd.read_csv(_TIME_SERIES_URL.format("recovered"))
death_cases_df = pd.read_csv(_TIME_SERIES_URL.format("deaths"))

# Quick look at the raw confirmed-cases table.
confirmed_cases_df.head()
confirmed_cases_df.info()
# Collapse the raw rows into one row per country.
# numeric_only=True is required on pandas >= 2.0, where groupby().sum() no
# longer silently discards non-numeric columns: a leftover text column would
# become a row label after the transpose and make the DatetimeIndex
# conversion below fail.  On older pandas the result is unchanged.
confirmed_country = (
    confirmed_cases_df.groupby('Country/Region')
    .sum(numeric_only=True)
    .drop(['Lat', 'Long'], axis=1)
)
confirmed_country.head()

# Flip to one row per date and one column per country.
confirmed_country = confirmed_country.transpose()
confirmed_country.index
confirmed_country.index.dtype

# Set index as DateTimeIndex (the row labels are the original date column
# headers, i.e. strings).  The index is left unnamed on purpose: later code
# refers to it as "index" after reset_index() and in plot labels.
datetime_index = pd.DatetimeIndex(confirmed_country.index)
confirmed_country.set_index(datetime_index, inplace=True)

# Check out index
confirmed_country.index
# Countries compared in the confirmed-cases line charts.
countries = [
    'US', 'Brazil', 'Russia', 'India', 'United Kingdom',
    'Spain', 'Italy', 'France', 'Iran', 'China',
]
_confirmed_labels = {"index": "Date", "value": "Reported Confirmed cases count"}
_confirmed_title = "Reported Confirmed Cases Time Series"

# Linear y-axis.
confirmed_country[countries].plot(labels=_confirmed_labels, title=_confirmed_title)

# Same series on a logarithmic y-axis.
confirmed_country[countries].plot(
    labels=_confirmed_labels,
    title=_confirmed_title,
    log_y=True,
)
# Latest date: take the last row of the date-indexed table and turn it into a
# two-column frame ("Country/Region", "count") for the world map.
confirmed_country_temp = (
    confirmed_country.iloc[-1]
    .to_frame(name='count')
    .reset_index()
)

_world_map_options = dict(
    locations="Country/Region",
    locationmode='country names',
    color="count",
    hover_name="Country/Region",
    range_color=[1, 500000],
    color_continuous_scale="peach",
    title='Countries with Confirmed Cases',
)
fig = px.choropleth(confirmed_country_temp, **_world_map_options)
fig.show()
# Country names kept for the Europe-only map.
europe = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia',
    'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece',
    'Hungary', 'Ireland', 'Italy', 'Latvia', 'Luxembourg', 'Lithuania',
    'Malta', 'Norway', 'Netherlands', 'Poland', 'Portugal', 'Romania',
    'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'United Kingdom', 'Iceland',
    'Russia', 'Switzerland', 'Serbia', 'Ukraine', 'Belarus', 'Albania',
    'Bosnia and Herzegovina', 'Kosovo', 'Moldova', 'Montenegro',
    'North Macedonia',
]

# Keep only the latest-date rows whose country is in the list above.
_is_european = confirmed_country_temp["Country/Region"].isin(europe)
europe_confirmed_country_temp = confirmed_country_temp[_is_european]
europe_confirmed_country_temp.head()

_europe_map_options = dict(
    locations="Country/Region",
    locationmode='country names',
    color="count",
    hover_name="Country/Region",
    range_color=[1, 200000],
    color_continuous_scale='portland',
    title='European Countries with Confirmed Cases',
    scope='europe',
    height=800,
)
fig = px.choropleth(europe_confirmed_country_temp, **_europe_map_options)
fig.show()
# Reshape the wide (date x country) table into long form: one row per
# (country, date) pair with the confirmed count.
confirmed_country_time = confirmed_country.unstack().reset_index()
confirmed_country_time.columns = ["country", "date", "confirmed"]

# The animation slider uses the "date" column as frame label, so dates are
# stored as MM/DD/YYYY strings.
confirmed_country_time['date'] = (
    pd.to_datetime(confirmed_country_time['date']).dt.strftime('%m/%d/%Y')
)
confirmed_country_time.head()

_confirmed_geo_options = dict(
    locations="country",
    locationmode='country names',
    color="confirmed",
    size='confirmed',
    hover_name="country",
    range_color=[0, 100000],
    projection="natural earth",
    animation_frame="date",
    title='COVID-19: Confirmed cases Spread Over Time',
    color_continuous_scale="portland",
)
fig = px.scatter_geo(confirmed_country_time, **_confirmed_geo_options)
fig.show()
# Per-country confirmed totals (one row per country, one column per date),
# ordered by the count in the last date column, largest first.
# numeric_only=True keeps this working on pandas >= 2.0, where groupby().sum()
# no longer drops non-numeric columns on its own.
temp = (
    confirmed_cases_df.groupby('Country/Region')
    .sum(numeric_only=True)
    .drop(["Lat", "Long"], axis=1)
    .sort_values(confirmed_cases_df.columns[-1], ascending=False)
)

threshold = 50  # only counts above this value are plotted
days = 90       # maximum number of above-threshold days drawn per country

f = plt.figure(figsize=(15, 12))
ax = f.add_subplot(111)

for i, country in enumerate(temp.index):
    # Plot the nine highest-ranked countries, plus India and Japan wherever
    # they rank.
    if i >= 9 and country not in ("India", "Japan"):
        continue

    t = temp.loc[country].values
    t = t[t > threshold][:days]
    if len(t) < 2:
        # Not enough samples above the threshold to interpolate or draw a
        # line; skip instead of crashing on an empty/one-point series.
        continue
    date = np.arange(len(t))

    if country != "India":
        # Resample the curve on 30 evenly spaced points (k=1: piecewise
        # linear) and mark only its last point.
        xnew = np.linspace(date.min(), date.max(), 30)
        spl = make_interp_spline(date, t, k=1)  # type: BSpline
        power_smooth = spl(xnew)
        plt.plot(xnew, power_smooth, '-o', label=country, linewidth=3, markevery=[-1])
    else:
        # India is highlighted: raw daily values, thicker line, hollow markers.
        marker_style = dict(linewidth=4, linestyle='-', marker='o',
                            markersize=10, markerfacecolor='#ffffff')
        plt.plot(date, t, "-.", label=country, **marker_style)

plt.tick_params(labelsize=14)
plt.xticks(np.arange(0, days, 7), ["Day " + str(d) for d in range(0, days, 7)])

# Reference lines: exponential growth starting at `threshold` and doubling
# every `period` days.  Each entry is
# (number of points, doubling period, label, index of the x value the label is
#  anchored to, line colour, extra text styling).
# India is following a trend similar to doubling the cases every 4 days (red
# line), but the rate may increase.
reference_lines = [
    (18, 1, "No. of cases doubles every day", -2, "gray", dict(alpha=0.5)),
    (int(days - 22), 2, ".. every second day", -3, "gray", dict(alpha=0.5)),
    (int(days - 4), 7, ".. every week", -3, "gray", dict(alpha=0.5)),
    (int(days - 4), 30, ".. every month", -3, "gray", dict(alpha=0.5)),
    (int(days - 5), 4, ".. every 4 days", -3, "Red", dict(color="Red", alpha=0.8)),
]
for n_points, period, label, anchor, line_color, text_style in reference_lines:
    x = np.arange(0, n_points)
    y = 2 ** (x / period + np.log2(threshold))
    plt.plot(x, y, "--", linewidth=2, color=line_color)
    plt.annotate(label, (x[anchor], y[-1]), xycoords="data", fontsize=14, **text_style)

# plot Params
plt.xlabel("Days", fontsize=17)
plt.ylabel("Number of Confirmed Cases", fontsize=17)
plt.title("Trend Comparison of Confirmed Cases for Different Countries", fontsize=22)
plt.legend(loc="upper left")
plt.yscale("log")
plt.grid(which="both")
#plt.savefig('Trend Comparison with India (confirmed).png')
plt.show()
# One row per country with summed death counts.
# numeric_only=True is required on pandas >= 2.0, where groupby().sum() no
# longer silently discards non-numeric columns: a leftover text column would
# become a row label after the transpose and break the DatetimeIndex
# conversion below.  On older pandas the result is unchanged.
deaths_country = (
    death_cases_df.groupby('Country/Region')
    .sum(numeric_only=True)
    .drop(['Lat', 'Long'], axis=1)
)
# Flip to one row per date and one column per country.
deaths_country = deaths_country.transpose()
# Set index as DateTimeIndex (row labels are the original date column headers).
datetime_index = pd.DatetimeIndex(deaths_country.index)
deaths_country.set_index(datetime_index, inplace=True)
deaths_country.head()
# Countries compared in the death-count line charts.
countries = [
    'US', 'Brazil', 'Russia', 'India', 'United Kingdom',
    'Spain', 'Italy', 'France', 'Iran', 'China',
]
_death_labels = {"index": "Date", "value": "Reported Death cases count"}
_death_title = "Reported Death Cases Time Series"

# Linear y-axis.
deaths_country[countries].plot(labels=_death_labels, title=_death_title)

# Same series on a logarithmic y-axis.
deaths_country[countries].plot(
    labels=_death_labels,
    title=_death_title,
    log_y=True,
)
# Long-form death counts: one row per (country, date) pair.
deaths_country_time = deaths_country.unstack().reset_index()
deaths_country_time.columns = ["country", "date", "death"]

# Dates are stored as MM/DD/YYYY strings because the "date" column labels the
# animation frames.
deaths_country_time['date'] = (
    pd.to_datetime(deaths_country_time['date']).dt.strftime('%m/%d/%Y')
)
deaths_country_time.head()

_death_geo_options = dict(
    locations="country",
    locationmode='country names',
    color="death",
    size='death',
    hover_name="country",
    range_color=[0, 10000],
    projection="natural earth",
    animation_frame="date",
    title='COVID-19: Death Cases Spread Over Time',
    color_continuous_scale="portland",
)
fig = px.scatter_geo(deaths_country_time, **_death_geo_options)
fig.show()
# One row per country with summed recovered counts.
# numeric_only=True is required on pandas >= 2.0, where groupby().sum() no
# longer silently discards non-numeric columns: a leftover text column would
# become a row label after the transpose and break the DatetimeIndex
# conversion below.  On older pandas the result is unchanged.
recovered_country = (
    recovered_cases_df.groupby('Country/Region')
    .sum(numeric_only=True)
    .drop(['Lat', 'Long'], axis=1)
)
# Flip to one row per date and one column per country.
recovered_country = recovered_country.transpose()
# Set index as DateTimeIndex (row labels are the original date column headers).
datetime_index = pd.DatetimeIndex(recovered_country.index)
recovered_country.set_index(datetime_index, inplace=True)
recovered_country.head()
# Countries compared in the recovered-count line charts.
countries = [
    'US', 'Brazil', 'Russia', 'India', 'United Kingdom',
    'Spain', 'Italy', 'France', 'Iran', 'China',
]
_recovered_labels = {"index": "Date", "value": "Recovered cases count"}
_recovered_title = "Recovered Cases Time Series"

# Linear y-axis.
recovered_country[countries].plot(labels=_recovered_labels, title=_recovered_title)

# Same series on a logarithmic y-axis.
recovered_country[countries].plot(
    labels=_recovered_labels,
    title=_recovered_title,
    log_y=True,
)
# Long-form recovered counts: one row per (country, date) pair.
recovered_country_time = recovered_country.unstack().reset_index()
recovered_country_time.columns = ["country", "date", "recovered"]

# Dates are stored as MM/DD/YYYY strings because the "date" column labels the
# animation frames.
recovered_country_time['date'] = (
    pd.to_datetime(recovered_country_time['date']).dt.strftime('%m/%d/%Y')
)
recovered_country_time.head()

_recovered_geo_options = dict(
    locations="country",
    locationmode='country names',
    color="recovered",
    size='recovered',
    hover_name="country",
    range_color=[0, 10000],
    projection="natural earth",
    animation_frame="date",
    title='COVID-19: Recoveries Spread Over Time',
    color_continuous_scale="portland",
)
fig = px.scatter_geo(recovered_country_time, **_recovered_geo_options)
fig.show()
def remove_dup_columns(frame):
    """Return a copy of *frame* keeping only the first column for each name.

    Columns are compared by label; when a label occurs more than once, the
    left-most column is kept and later ones are dropped.  The relative order
    of the remaining columns is unchanged.

    Args:
        frame: DataFrame that may contain repeated column labels.

    Returns:
        A new DataFrame with unique column labels.  It is an independent copy,
        so adding or assigning columns on the result does not touch *frame*.
    """
    # Index.duplicated() flags every repeat after the first occurrence.
    first_occurrence = ~frame.columns.duplicated()
    return frame.loc[:, first_occurrence].copy()
# Combine the three long-format tables into one frame with the columns
# country, date, confirmed, death, recovered.
# The tables are aligned on their (country, date) key instead of on row
# position: a positional side-by-side concat silently pairs the wrong rows if
# one source lists a different set or order of countries/dates.  When all
# three tables have identical keys the result is the same as before.
_key_columns = ['country', 'date']
combined_country = pd.concat(
    [
        confirmed_country_time.set_index(_key_columns)['confirmed'],
        deaths_country_time.set_index(_key_columns)['death'],
        recovered_country_time.set_index(_key_columns)['recovered'],
    ],
    axis=1,
).reset_index()

# Active = confirmed cases that are neither dead nor recovered.
combined_country['active'] = (
    combined_country['confirmed']
    - combined_country['death']
    - combined_country['recovered']
)
# Deaths as a percentage of confirmed cases, rounded to two decimals.
combined_country['mortality_rate'] = np.round(
    (combined_country['death'] / combined_country['confirmed']) * 100, 2
)
combined_country.tail()
# combined_country holds one row per (country, date).  This chart is titled as
# an overall trend, so the rows are first summed to a single total per date;
# plotting the raw rows would overlay one bar per country on every date.
# sort=False keeps dates in order of first appearance (the date strings are
# MM/DD/YYYY and would not sort chronologically across years).
_totals_by_date = (
    combined_country
    .groupby('date', sort=False)[['confirmed', 'death', 'recovered', 'active']]
    .sum()
    .reset_index()
)
x = _totals_by_date['date']
y = _totals_by_date['confirmed']
z = _totals_by_date['death']
w = _totals_by_date['recovered']
a = _totals_by_date['active']
#call Bar charts and assign to fig_t. Add traces to the same figure
fig_t = go.Figure(go.Bar(x=x, y=y, name='Total Confirmed', marker_color='indianred', opacity=.7))
fig_t.add_trace(go.Bar(x=x, y=a, name='Total Active', marker_color='mediumblue', opacity=0.7))
fig_t.add_trace(go.Bar(x=x, y=w, name='Total recovered', marker_color='lightseagreen', opacity=0.7))
fig_t.add_trace(go.Bar(x=x, y=z, name='Total deaths', marker_color='gray', opacity=1))
#here we define layout of the chart: bars of the four traces are drawn on top
#of each other, and dates are treated as categories
fig_t.update_layout(barmode='overlay',
                    xaxis={'categoryorder': 'total ascending'},
                    xaxis_type='category',
                    title={
                        'text': 'Cummulative COVID-19 Trend',
                        'y': 0.79,
                        'x': 0.45,
                        'xanchor': 'center',
                        'yanchor': 'top'},)
fig_t.update_xaxes(title='----->Timeline', showline=True)
fig_t.update_yaxes(title='----->Number of cases', showline=True)
fig_t.show()
# combined_country holds one row per (country, date).  For an overall
# mortality-rate trend the deaths and confirmed cases are summed per date and
# the rate is computed from those totals; plotting the per-country
# 'mortality_rate' column directly would overlay one bar per country on every
# date.  sort=False keeps dates in order of first appearance.
_totals_by_date = (
    combined_country
    .groupby('date', sort=False)[['confirmed', 'death']]
    .sum()
    .reset_index()
)
x = _totals_by_date['date']
# Deaths as a percentage of confirmed cases, rounded to two decimals.
m = np.round((_totals_by_date['death'] / _totals_by_date['confirmed']) * 100, 2)
fig_t = go.Figure(go.Bar(x=x, y=m, name='Mortality Rate', marker_color='indianred', opacity=.7))
#here we define layout of the chart
fig_t.update_layout(barmode='overlay',
                    xaxis={'categoryorder': 'total ascending'},
                    xaxis_type='category',
                    title={
                        'text': 'Cummulative Mortality Rate Trend',
                        'y': 0.79,
                        'x': 0.45,
                        'xanchor': 'center',
                        'yanchor': 'top'},)
fig_t.update_xaxes(title='----->Timeline', showline=True)
fig_t.update_yaxes(title='----->Mortality Rate', showline=True)
fig_t.show()
# Rows of the combined table that belong to India.
_is_india = combined_country["country"] == 'India'
india_data = combined_country[_is_india]

x = india_data['date']
y = india_data['confirmed']
z = india_data['death']
w = india_data['recovered']
a = india_data['active']

# One bar trace per metric, added in drawing order:
# (values, legend name, colour, opacity).
_india_traces = [
    (y, 'Total Confirmed', 'indianred', .7),
    (a, 'Total Active', 'mediumblue', 0.7),
    (w, 'Total recovered', 'lightseagreen', 0.7),
    (z, 'Total deaths', 'gray', 1),
]
fig_t = go.Figure()
for _values, _name, _color, _opacity in _india_traces:
    fig_t.add_trace(
        go.Bar(x=x, y=_values, name=_name, marker_color=_color, opacity=_opacity)
    )

# Layout: traces overlaid on a categorical date axis, title placed manually.
_india_title = {
    'text': 'Cummulative COVID-19 Trend of India',
    'y': 0.79,
    'x': 0.45,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig_t.update_layout(
    barmode='overlay',
    xaxis={'categoryorder': 'total ascending'},
    xaxis_type='category',
    title=_india_title,
)
fig_t.update_xaxes(title='----->Timeline', showline=True)
fig_t.update_yaxes(title='----->Number of cases', showline=True)
fig_t.show()
# Build the two-column frame fitted by the forecasting model below:
# "ds" holds the date and "y" the confirmed count for India.
_india_series = confirmed_country['India'].reset_index()
confirmed_india = _india_series.rename(columns={'index': 'ds', 'India': 'y'})
confirmed_india['ds'] = pd.to_datetime(confirmed_india['ds'])
confirmed_india.head()
Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.
Prophet is open source software released by Facebook’s Core Data Science team.
# The package was renamed from "fbprophet" to "prophet" in its 1.0 release.
# Try the current name first and fall back to the old one so the script runs
# with either installation.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

# interval_width sets the width of the uncertainty interval reported as
# yhat_lower / yhat_upper.
model = Prophet(interval_width=0.95)
model.fit(confirmed_india)

# Extend the fitted date range by 7 more periods.
future_7 = model.make_future_dataframe(periods=7)
future_7.tail()

#predicting the future with date, upper and lower limit of y values
forecast = model.predict(future_7)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

# Forecast plot, then the plot of the model's fitted components.
confirmed_forecast_plot = model.plot(forecast)
confirmed_forecast_plot = model.plot_components(forecast)